### load already extracted output
dilsOut <- readRDS(paste0(folder_path, "/dilsOut_withoutWeirds.rds"))
analyses <- read.delim(paste0(folder_path, "/analysis_SpLn.txt")) %>%
dplyr::rename(analysisType = analysis, analysis = analysis_pair) %>%
left_join(count(dilsOut, analysis)) %>%
rename(nReps = n) %>%
rownames_to_column("order")
set <- analyses %>%
pivot_longer(sp_species1:sp_species2) %>%
pull(value) %>%
unique()
data <- read.delim(paste0(folder_path, "samples_lineages.txt"))
knitr::kable(data, format = "html") %>%
kable_styling(fixed_thead = T, font_size = 10) %>%
scroll_box(width = "700px", height = "300px")
| sample | species | caste | mitochondrial | datatype_nuc | Lineage |
|---|---|---|---|---|---|
| I37-MSTRQP1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| I38-MSTRQP1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| I39-MSTRQP1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| I53-MSTRQN1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| I54-MSTRQN1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| Mibe | structor | queen | ibericus1 | WGS | ibericus1 |
| NYLS4Q1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| NYLS5Q1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| PASS5Q1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| PASS6M1 | structor | male | ibericus1 | RNA-seq | ibericus1 |
| PASS6Q1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| PASS8Q1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| POLL1Q1 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| RFOU2M | structor | male | ibericus1 | WGS | ibericus1 |
| RINSAM | structor | male | ibericus1 | WGS | ibericus1 |
| RINSAQ | structor | queen | ibericus1 | WGS | ibericus1 |
| SRR4292927 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| SRR4292928 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| SRR4292930 | structor | queen | ibericus1 | RNA-seq | ibericus1 |
| I31-MAEGW | aegyptiacus | worker | mMebe3 | RNA-seq | Maeg |
| Maeg | aegyptiacus | queen | mMebe3 | WGS | Maeg |
| SRR4292920 | barbarus | queen | mMbar1 | RNA-seq | Mbar1 |
| SRR4292931 | barbarus | male | mMbar1 | RNA-seq | Mbar1 |
| SRR4292935 | barbarus | queen | mMbar1 | RNA-seq | Mbar1 |
| SRR4292902 | barbarus | queen | mMbar2 | RNA-seq | Mbar2 |
| SRR4292903 | barbarus | queen | mMbar2 | RNA-seq | Mbar2 |
| SRR4292909 | barbarus | queen | mMbar2 | RNA-seq | Mbar2 |
| SRR4292936 | barbarus | male | mMbar2 | RNA-seq | Mbar2 |
| SRR4292908 | capitatus | queen | capitatus | RNA-seq | Mcap |
| SRR4292910 | capitatus | queen | capitatus | RNA-seq | Mcap |
| I15-MEBEQ13 | ebeninus | queen | mMebe1 | RNA-seq | Mebe |
| I16-MEBEQ9 | ebeninus | queen | mMebe2 | RNA-seq | Mebe |
| I17-MEBEQ23 | ebeninus | queen | mMebe2 | RNA-seq | Mebe |
| I18-MEBEQ5 | ebeninus | queen | mMebe2 | RNA-seq | Mebe |
| I19-MEBEQ19 | ebeninus | queen | mMebe1 | RNA-seq | Mebe |
| I20-MEBEM3 | ebeninus | male | mMebe1 | RNA-seq | Mebe |
| SRR4292914 | ebeninus | queen | mMebe1 | RNA-seq | Mebe |
| Mtar | structor | queen | muticus6 | WGS | muticus6 |
| Y15472-1 | structor | queen | muticus6 | WGS | muticus6 |
| LCR4Q | wasmanni | queen | wasmanni | WGS | Mwas |
| SRR4292932 | wasmanni | queen | wasmanni | RNA-seq | Mwas |
| RDNIPQ | structor | queen | ponticus2 | WGS | ponticus2 |
| RPODGQ | structor | queen | ibericus1 | WGS | ponticus2 |
| SRR4292916 | structor | queen | ponticus2 | RNA-seq | ponticus2 |
| SRR4292917 | structor | queen | ponticus2 | RNA-seq | ponticus2 |
| M8914AQ | structor | queen | structor3 | WGS | structor3 |
| M893AQ | structor | queen | structor3 | WGS | structor3 |
| SRR4292926 | structor | queen | structor3 | RNA-seq | structor3 |
| M883AQ | structor | queen | structor4 | WGS | structor4 |
| M897AQ | structor | queen | structor4 | WGS | structor4 |
samplesData <- data %>%
group_by(Lineage) %>%
tally() %>%
left_join(pivot_longer(data = analyses, cols = sp_species1:sp_species2, values_to = "Lineage"),
by = "Lineage") %>%
select(-Lineage) %>%
pivot_wider(names_from = name, names_sep = "nSamples_{name}", values_from = n)
dilsOut <- left_join(dilsOut, samplesData) %>%
mutate(analysisType = case_when(analysis %in% "Mbar_Mstr" ~ "species", TRUE ~
analysisType))
parameters <- dilsOut %>%
select("/config.yaml") %>%
unnest(cols = c("/config.yaml")) %>%
filter(!V1 %in% c("infile:", "region:", "nspecies:", "nameA:", "nameB:", "nameOutgroup:",
"config_yaml:", "timeStamp:")) %>%
distinct()
knitr::kable(parameters, col.names = c("Parameters", "Values"), format = "html")
| Parameters | Values |
|---|---|
| useSFS: | 1 |
| lightMode: | FALSE |
| population_growth: | variable |
| modeBarrier: | bimodal |
| max_N_tolerated: | 0.1 |
| Lmin: | 30 |
| nMin: | 2 |
| mu: | 3e-09 |
| rho_over_theta: | 0.1 |
| N_min: | 100 |
| N_max: | 1200000 |
| Tsplit_min: | 100 |
| Tsplit_max: | 10000000 |
| M_min: | 0.4 |
| M_max: | 20 |
Should rerun analysis without RPODGQ ponticus (nuclear more like ponticus while mtDNA more like ibericus1).
Should also rerun without weird SRR4292926 structor3 sample (closer to structor3 workers than other two structor3 queens). Not sure if it is a potential structorM like sample or because this is RNAseq, but for other lineages with different type of data the localization in the tree is not so weird as this sample.
Not enough queens to include mcarthuri7. Made some tests comparing ibericus1 and ponticus2 to including structor3+structor4+structor6 as well as the different combinations of two of these lineages.
gof <- dilsOut %>%
select("analysis", "species1", "species2", "rep",
"/gof_2/goodness_of_fit_test.txt") %>% ##expectations based on optimized posterior
group_by(analysis, species1, species2, rep) %>%
unnest(cols = c(`/gof_2/goodness_of_fit_test.txt`)) %>%
select(analysis, stats, pvals_fdr_corrected) %>%
filter(stats %in% c("FST_avg", "FST_std", "netdivAB_avg", "netdivAB_std",
"piA_avg", "piA_std", "piB_avg", "piB_std")) %>%
group_by(analysis, stats) %>%
add_count(sign.pvalue = (pvals_fdr_corrected < 0.05)) %>%
left_join(analyses[,c('analysis','order')]) %>%
arrange(order)
bad <- unique(gof[gof$pvals_fdr_corrected < 0.05, c('rep','analysis')]) %>%
mutate(repanalysis = paste0(rep,analysis))
dilsOutF <- dilsOut %>%
mutate(repanalysis = paste0(rep,analysis)) %>%
filter(!repanalysis %in% bad$repanalysis) %>%
select(-repanalysis)
# filter the analyses which at least one stat had a significant p-value
#### Get best model ####
bestModel <- dilsOutF %>%
select("analysis", "analysisType","species1", "species2", "rep",
'/modelComp/hierarchical_models.txt') %>%
group_by(analysis, species1, species2, rep) %>%
unnest(cols = c(`/modelComp/hierarchical_models.txt`)) %>%
mutate(rows = c("bestModel","bestModelProb")) %>%
pivot_wider(values_from = contains("versus"),
names_from = rows)
bestModelw <- bestModel %>%
pivot_longer(cols = !c(analysis,analysisType,species1,species2, rep),
names_to = 'modelTest') %>%
mutate(bestModel = word(modelTest,2, sep = "_"),
modelTest = word(modelTest,1, sep = "_")) %>%
pivot_wider(names_from = bestModel, values_from = value) %>%
drop_na(bestModel)
bestModelw$modelTest <- factor(bestModelw$modelTest,
levels = c("migration versus isolation",
"AM versus SI", "IM versus SC",
"N-homo versus N-hetero",
"M-homo versus M-hetero"))
colModels <- data.frame(models = c("isolation", "migration","SI", "AM", "IM", "SC","Nhomo","Nhetero","Mhomo","Mhetero"),
cols = c("#5D8937","#95548D","#435F47","#99A565","#7134D3", "#C384C2","#A0A3BC", "#89CEAE", "#7E90C4", "#CD5CBD"))
dilsOutFBest <- dilsOut %>%
mutate(repanalysis = paste0(rep,analysis)) %>%
filter(!repanalysis %in% bad$repanalysis) %>%
select(-repanalysis)
bestReps <- dilsOutFBest %>%
select("analysis","analysisType", "species1", "species2", "rep",
'/modelComp/hierarchical_models.txt') %>%
group_by(analysis, analysisType, species1, species2, rep) %>%
unnest(cols = c(`/modelComp/hierarchical_models.txt`)) %>%
mutate(rows = c("bestModel","bestModelProb")) %>%
filter(rows %in% 'bestModelProb') %>%
select(analysis, analysisType, rep, `migration versus isolation`) %>%
group_by(analysis) %>%
filter(`migration versus isolation` == max(`migration versus isolation`)) %>%
mutate(toFilter = paste0(analysis,rep))
sumStats <- dilsOut %>%
select(analysis, "rep", `/ABCstat_global.txt`) %>%
unnest(cols = c("/ABCstat_global.txt")) %>%
select(analysis, rep, piA_avg, piB_avg, thetaA_avg, thetaB_avg, netdivAB_avg,
divAB_avg)
### Diversity
pis <- sumStats %>%
pivot_longer(cols = piA_avg:thetaB_avg, names_to = "stats") %>%
mutate(lineage = case_when(str_detect(stats, "A_avg") ~ word(analysis, sep = "_",
1), str_detect(stats, "B_avg") ~ word(analysis, sep = "_", 2)), stats = case_when(str_detect(stats,
"pi") ~ "pi_avg", str_detect(stats, "theta") ~ "theta_avg")) %>%
distinct() %>%
mutate(toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter) %>%
left_join(analyses[, c("analysis", "order")]) %>%
arrange(as.numeric(order))
p <- ggplot(pis) + geom_point(aes(x = value, y = fct_rev(fct_inorder(lineage)), color = stats),
position = position_dodge(width = 0.5), alpha = 0.5) + labs(x = "Genetic diversity",
y = "Lineages") + theme_bw() + theme(legend.position = "right") + scale_color_manual(values = iwanthue(10)[1:2])
#### Get divergences ####
dis <- sumStats %>%
select(analysis, rep, netdivAB_avg, divAB_avg) %>%
rename(dxy_avg = divAB_avg, da_avg = netdivAB_avg) %>%
pivot_longer(cols = da_avg:dxy_avg, names_to = "stats") %>%
mutate(toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter) %>%
left_join(analyses[, c("analysis", "order")]) %>%
arrange(as.numeric(order))
dxyda <- ggplot(dis) + geom_point(aes(y = fct_rev(fct_inorder(analysis)), x = value,
color = stats, group = stats), position = position_dodge(width = 0.5), alpha = 0.5) +
labs(x = "Divergences", y = "Lineages comparison") + theme_bw() + theme(legend.position = "right") +
scale_color_manual(values = iwanthue(10)[3:4])
ggarrange(p, dxyda, nrow = 2, heights = c(0.8, 1), align = "v")
bestModelwProb <- bestModelw %>%
mutate(toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter) %>%
mutate(steps = case_when(modelTest %in% "migration versus isolation" ~ "Step 1",
modelTest %in% c("AM versus SI", "IM versus SC") ~ "Step 2", modelTest %in%
c("N-homo versus N-hetero", "M-homo versus M-hetero") ~ "Step 3"), bestModel = factor(bestModel,
levels = c("isolation", "migration", "AM", "IM", "SC", "Nhomo", "Nhetero",
"Mhomo", "Mhetero")))
p <- ggplot(bestModelwProb, aes(y = as.numeric(bestModelProb), x = fct_rev(analysisType),
color = bestModel)) + geom_point() + ggrepel::geom_text_repel(aes(label = analysis),
box.padding = 0.2, size = 2, show.legend = FALSE) + facet_wrap(~steps) + labs(y = "Posterior probability",
x = "") + theme_bw() + scale_color_iwanthue() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# scale_color_iwanthue(name = 'Best Model', labels = c('Isolation',
# 'Migration', 'AM', 'IM', 'SC', 'N-homo','N-hetero','M-homo','M-hetero')) +
# scale_shape_manual(values = c(1, 2, 3)) +
p
guide_legend2 <- function(...) ggplot2::guide_legend(...) ##rmarkdown conflict with guide_legend
ggplot(filter(bestModelwProb, !steps %in% "Step 3"), aes(y = as.numeric(bestModelProb),
x = fct_rev(analysisType), color = bestModel, shape = steps)) + geom_point() +
ggrepel::geom_text_repel(aes(label = analysis), size = 2, show.legend = FALSE) +
facet_wrap(~steps) + labs(y = "Posterior probability", x = "") + theme_bw() +
scale_color_iwanthue(name = "Best Model", labels = c("Isolation", "Migration",
"AM", "IM", "SC")) + scale_shape_manual(values = c(1, 2)) + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) + guides(shape = "none", color = guide_legend2(override.aes = list(shape = c(1,
1, 2, 2, 2))))
step23 <- select(bestModelwProb, analysis, bestModel, steps) %>%
filter(steps %in% c("Step 2", "Step 3")) %>%
group_by(analysis) %>%
summarise(bestModel23 = paste0(bestModel, collapse = "_"))
bestModelDxyDa <- bestModelw %>%
filter(modelTest %in% "migration versus isolation") %>%
mutate(problMigration = case_when(bestModel %in% "isolation" ~ 1 - as.numeric(bestModelProb),
TRUE ~ as.numeric(bestModelProb)), toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter) %>%
select(-modelTest, -bestModelProb) %>%
distinct() %>%
left_join(select(sumStats, analysis, rep, netdivAB_avg, divAB_avg)) %>%
left_join(analyses[, c("analysis", "order")]) %>%
left_join(step23) %>%
arrange(as.numeric(order))
p <- ggplot(bestModelDxyDa, aes(x = divAB_avg, y = problMigration, shape = bestModel23,
color = analysisType)) + geom_point(alpha = 0.7) + ggrepel::geom_text_repel(aes(label = analysis),
box.padding = 0.3, segment.size = 0.3, size = 3, show.legend = FALSE) + scale_color_iwanthue() +
scale_shape_manual(values = 0:6) + labs(y = "Probability of migration", x = "Dxy",
shape = "Best models", color = "Analyses type") + theme_bw() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) + lims(x = c(0, 0.04), y = c(0, 1))
p
ggsave(paste0(folder_path, "probabilityMigration.pdf"), p, width = 9, height = 6)
p2 <- ggplot(filter(bestModelDxyDa, str_detect(analysis, "pon2|str3")), aes(x = divAB_avg,
y = problMigration, shape = bestModel23)) + geom_point(alpha = 0.7) + ggrepel::geom_text_repel(aes(label = analysis),
box.padding = 0.3, segment.size = 0.3, size = 3, show.legend = FALSE) + scale_shape_manual(values = 0:6) +
labs(y = "Probability of migration", x = "Dxy", shape = "Best models") + theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
lims(x = c(0, 0.04), y = c(0, 1))
p2
ggsave(paste0(folder_path, "probabilityMigration_weirdsWithout.pdf"), p, width = 9,
height = 6)
PCA <- dilsOut %>%
select("analysis", "species1", "species2", "analysisType", "rep", "/table_coord_PCA_SS.txt") %>%
unnest(cols = c("/table_coord_PCA_SS.txt")) %>%
mutate(toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter)
eig <- dilsOut %>%
select("analysis", "analysisType", "species1", "species2", "rep", "/table_eigenvalues_PCA_SS.txt") %>%
unnest(cols = c("/table_eigenvalues_PCA_SS.txt")) %>%
mutate(toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter, V1 %in% c("comp 1", "comp 2")) %>%
select(-toFilter, -eigenvalue, -`cumulative percentage of variance`) %>%
pivot_wider(names_from = V1, values_from = `percentage of variance`)
pls <- purrr::map(which(eig$analysisType %in% "lineages"), ~ggplot(filter(PCA, analysis %in%
unique(PCA$analysis)[.x]), aes(x = Dim.1, y = Dim.2, color = origin)) + geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) + geom_point() + theme_bw() + geom_point(data = filter(PCA,
analysis %in% unique(PCA$analysis)[.x], origin %in% "observed dataset"), aes(x = Dim.1,
y = Dim.2), color = "#D7191C") + scale_colour_brewer(palette = "Spectral", name = "") +
labs(title = analyses[analyses$analysis %in% unique(PCA$analysis)[.x], "analysis"],
x = paste0("PC1 (", round(pull(eig[eig$analysis %in% unique(PCA$analysis)[.x],
"comp 1"]), 2), "%)"), y = paste0("PC2 (", round(pull(eig[eig$analysis %in%
unique(PCA$analysis)[.x], "comp 2"]), 2), "%)")) + theme(plot.title = element_text(size = 9)))
q <- ggarrange(plotlist = pls, common.legend = TRUE, nrow = 4, ncol = 4)
q
pls <- purrr::map(which(eig$analysisType %in% "species"), ~ggplot(filter(PCA, analysis %in%
unique(PCA$analysis)[.x]), aes(x = Dim.1, y = Dim.2, color = origin)) + geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) + geom_point() + theme_bw() + geom_point(data = filter(PCA,
analysis %in% unique(PCA$analysis)[.x], origin %in% "observed dataset"), aes(x = Dim.1,
y = Dim.2), color = "#D7191C") + scale_colour_brewer(palette = "Spectral", name = "") +
labs(title = analyses[analyses$analysis %in% unique(PCA$analysis)[.x], "analysis"],
x = paste0("PC1 (", round(pull(eig[eig$analysis %in% unique(PCA$analysis)[.x],
"comp 1"]), 2), "%)"), y = paste0("PC2 (", round(pull(eig[eig$analysis %in%
unique(PCA$analysis)[.x], "comp 2"]), 2), "%)")) + theme(plot.title = element_text(size = 9)))
q <- ggarrange(plotlist = pls, common.legend = TRUE, nrow = 2, ncol = 4)
q
Population sizes
* Na: effective size of the ancestral population [# of diploid individuals]
* N1; N2: effective size of population 1 (resp. 2) [# of diploid individuals]
If Ne is genomically heterogeneous
* shape_N_a; shape_N_b: shape parameter α (resp. β) of the Beta(α, β) distribution for Ne (shared by all populations)
If there is a demographic change
* Tdem1; Tdem2: time of the demographic change in population 1 (resp. 2) [# of generations]
* founders1; founders2: number of founder individuals in population 1 (resp. 2) at the time of the demographic change Tdem1 (and Tdem2)
Times for two populations or more
* Tsplit: time of split at which the ancestral population subdivides in two populations [# of generations]
If there is both gene flow and isolation
* Tsc: time of secondary contact at which the two populations start exchanging genes [# of generations]
* Tam: time of ancient migration at which the two populations stop exchanging genes [# of generations]
Migration and barriers
* M12; M21: introgression rate from population 2 to 1 (resp. from 1 to 2) [# of migrants per generation]
If N.m is genomically heterogeneous (bimodal model)
* nBarriersM12; nBarriersM21: number of loci inferred as interspecies barriers for introgression from population 2 to 1 (resp. from 1 to 2)
If N.m is genomically heterogeneous (beta model)
* shape_M12_a; shape_M12_b: shape parameter α (resp. β) of the Beta(α, β) distribution for N.m (from population 2 to 1)
* shape_M21_a; shape_M21_b: shape parameter α (resp. β) of the Beta(α β) distribution for N.m (from population 1 to 2)
estPar <- dilsOut %>%
select("analysis", "analysisType", "species1", "species2", "rep", "/best_model_5/posterior_summary_RandomForest_bestModel.txt") %>%
group_by(analysis) %>%
unnest(cols = c(`/best_model_5/posterior_summary_RandomForest_bestModel.txt`)) %>%
left_join(., bestModel[c("analysis", "rep", "migration versus isolation_bestModel")],
by = c("analysis", "rep")) %>%
rename(step1 = "migration versus isolation_bestModel") %>%
distinct() %>%
left_join(analyses[, c("analysis", "order")]) %>%
arrange(as.numeric(order)) %>%
mutate(toFilter = paste0(analysis, rep)) %>%
filter(toFilter %in% bestReps$toFilter)
estPar$param_f = factor(estPar$param, levels = c("Na", "N1", "N2", "founders1", "founders2",
"Tsplit", "Tam", "Tsc", "Tdem1", "Tdem2", "M12", "M21", "shape_N_a", "shape_N_b",
"nBarriersM12", "nBarriersM21"))
pars <- ggplot(estPar, aes(y = fct_rev(fct_inorder(analysis)), x = median, color = analysisType)) +
geom_point(size = 1) + geom_linerange(aes(xmin = `HPD2.5%`, xmax = `HPD%97.5`)) +
facet_wrap(~param_f, ncol = 4, scales = "free_x") + theme_bw() + labs(y = "Lineages comparison",
x = "Estimated parameters", color = "Analyses type") + theme(legend.position = "bottom",
axis.text.y = element_text(size = 5)) + scale_color_iwanthue()
pars